Read the dataset

# Load the daily stock-price table; each column is one ticker's price series.
data = read.csv("stock_price.csv")

Visualizing the Principal Components of Stocks

  1. Use PCA to reduce the dimension of stock-price information. Generate a scree plot and determine the number of principal components based on this plot. Plot the loadings of the first principal component.
# PCA on the stock prices.  scale=TRUE partially matches prcomp's `scale.`
# argument, so every column is standardised before the decomposition.
pca_stocks=prcomp(data, scale=TRUE)
# The default plot of a prcomp object is the scree plot of component variances.
plot(pca_stocks,main="Stocks Principal Component Analysis") ## same as screeplot(pca_stocks)
mtext(side=1, "Stocks Principal Components",  line=1, font=2)

# Scores of every observation on each principal component.
stocks = predict(pca_stocks)

Loadings for the first principal component

# Loadings (variable weights) of PC1, drawn as a line across the tickers.
plot(pca_stocks$rotation[,1],type='l', main = "The loadings for first principal component")

2) Generate scatter plots for principal component 1 and principal component 2 ## Principal Component 1 vs Principal Component 2

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# PC1 loadings as a plain vector; pca_stocks[2] is the `rotation` element.
# NOTE(review): `loadings` is never used afterwards -- looks like a leftover.
loadings=as.vector(pca_stocks[2]$rotation[,c("PC1")])
# Component scores as a data frame so columns can be selected by name.
stocks=predict(pca_stocks)
stocks=as.data.frame.matrix(stocks)
library(car) 
## Loading required package: carData
# Scatter plot of the first two principal-component scores.
plot(stocks[,c("PC1","PC2")], xlab="Principal Component 1", ylab="Principal Component 2", main="PC1 vs PC2")

3) Generate an MDS plot:

Taking the distance matrix and standardizing the data.

# One numeric label per observation in the stock data (127 rows).
labels=seq(1, 127, by=1)
# NOTE(review): `new_data` is assigned but not used below -- verify intent.
new_data=stocks[,c("PC1","PC2")];
# Classical multidimensional scaling on the full PC-score matrix.
data.dist=dist(stocks)
data.mds <- cmdscale(data.dist)
# Empty frame, then draw each observation's index at its MDS coordinates.
plot(data.mds, type = 'n')
text(data.mds)

# Biplot: observations and variable loadings on PC1/PC2 together.
biplot(pca_stocks)

# 2x2 panel: the first four principal-component scores over the index.
par(mfrow=c(2,2))
plot(pca_stocks$x[,1],xlab="index",ylab="PC1",main="Principal Component 1")
plot(pca_stocks$x[,2],xlab="index",ylab="PC2",main="Principal Component 2")
plot(pca_stocks$x[,3],xlab="index",ylab="PC3",main="Principal Component 3")
plot(pca_stocks$x[,4],xlab="index",ylab="PC4",main="Principal Component 4")

## Creating functions for K-means clustering and hierarchical clustering

#
# 
#
#
library("ape")
library("zoom")
do.kmeans <- function(data, labels, k = 3, do.scatter = FALSE) {
  # Run k-means clustering on `data`.
  #
  # Args:
  #   data:       numeric matrix / data frame of observations to cluster.
  #   labels:     text labels used for the optional scatter plot.
  #   k:          number of clusters (default 3).
  #   do.scatter: if TRUE, draw a scatter plot of `data` with points
  #               labelled by `labels` and coloured by cluster.
  #
  # Returns: the full kmeans() result object (also printed).
  heading=paste(c("K-means with Clusters", k), collapse = " ")
  print(heading)
  # Fixed seed so the randomly-initialised clustering is reproducible.
  set.seed(123)
  data.clu = kmeans(data, centers=k, nstart=10)
  if (do.scatter) {
    # Bug fix: the original referenced an undefined object `dataset`;
    # the intended object is the `data` argument.
    plot(data, type='n')
    text(data, labels=labels, col=rainbow(k)[data.clu$cluster])
  }
  print(data.clu)
  data.clu
}

do.hclust <- function(data, methodName ='single', labels, k=3, do.dendrogram=FALSE) {
  # Hierarchical clustering of `data` cut into k groups.
  #
  # Args:
  #   data:          numeric matrix / data frame of observations.
  #   methodName:    linkage method passed to hclust ("single", "complete", ...).
  #   labels:        kept for interface compatibility (not used here).
  #   k:             number of clusters to cut the tree into.
  #   do.dendrogram: if TRUE, draw a fan dendrogram (requires the `ape`
  #                  package for as.phylo) with tips coloured by cluster.
  #
  # Returns: integer vector of cluster assignments from cutree(hc, k).
  heading=paste(c("Hierarchical Clustering with", k," Clusters and Method name",methodName), collapse = " ")
  print(heading)
  data.dist = dist(data)
  hc = hclust(data.dist, method = methodName)
  # One cut at the requested k drives both the tip colours and the return
  # value (the original recomputed cutree separately per case).
  clus = cutree(hc, k)
  # Explicit palettes for the k values used in this analysis; fall back to
  # rainbow() for any other k.  Bug fix: the original 6-colour palette
  # listed "green" twice, so clusters 3 and 6 were indistinguishable.
  colors = switch(as.character(k),
    "2" = c("red", "blue"),
    "3" = c("red", "blue", "green"),
    "6" = c("red", "blue", "green", "purple", "orange", "brown"),
    rainbow(k)
  )
  if (do.dendrogram) {
    # Single full-page panel for the fan dendrogram.
    layout(matrix(1, nrow = 3, ncol = 3))
    title=paste(c("Dendrogram with", k,"Clusters and method is", methodName), collapse = " ")
    plot(as.phylo(hc), type = "fan",main=title, tip.color = colors[clus],label.offset = 1, cex = 0.9)
    #zm();
  }
  print(clus)
  clus
}

do.mdsplot <- function(data,labels,clusters,methodName,clusteredlabels){
  # Draw an MDS scatter plot coloured by cluster membership.
  #   data:            2-column MDS coordinate matrix (from cmdscale).
  #   labels:          text drawn for each observation.
  #   clusters:        number of clusters (sizes the rainbow palette).
  #   methodName:      clustering-method name, used in the plot title.
  #   clusteredlabels: integer cluster assignment per observation.
  title=paste(c(methodName,"Clustering with", clusters," Cluster/Clusters"), collapse = " ")
  plot(data, type = "n",ylim=c(-10,150),main=title)
  # NOTE(review): text(x, y, ...) is called with `labels` in the y position,
  # so the observation index is plotted on the y axis (consistent with the
  # fixed ylim of -10..150).  If the intent was to label points at their MDS
  # coordinates, this should be text(data, labels = labels, ...) -- confirm.
  text(data[,1], labels, col = rainbow(clusters)[clusteredlabels])
}
  1. Use different clustering algorithms and generate 8 MDS plots.
# K-means with 3 clusters on the raw stock data; keep only the assignments.
clu3_kmeans = do.kmeans(data, labels, k = 3)$cluster
## [1] "K-means with Clusters 3"
## K-means clustering with 3 clusters of sizes 37, 65, 25
## 
## Cluster means:
##         AA      AXP       BA      BAC       CAT     CSCO       CVX
## 1 16.20000 44.78486 70.50784 13.92324  97.37865 19.48216  95.71838
## 2 16.73554 47.73769 74.81431 12.58369 106.14031 17.10477 103.95338
## 3  0.01960 -0.22440 -0.35400  0.12400  -0.08480  0.22320  -0.29480
##         DD      DIS       GE       HD      HPQ      IBM     INTC      JNJ
## 1 50.88703 40.16486 19.59081 36.03676 44.08514 156.8776 20.99757 61.27676
## 2 53.74769 41.70738 19.88554 36.85415 40.05969 166.1420 21.75446 63.41246
## 3 -0.16680 -0.05520  0.05040 -0.02680  0.20040  -0.8992 -0.03080 -0.15920
##        JPM     KRFT       KO      MCD      MMM      MRK     MSFT      PFE
## 1 44.39514 63.31865 31.00378 75.00649 88.79135 33.80378 26.99838 18.82270
## 2 44.45154 66.50323 33.21785 78.68800 93.31262 34.74800 25.51031 20.35154
## 3  0.03320 -0.04400 -0.17400 -0.14840 -0.16800  0.01960  0.12560 -0.10560
##         PG        T      TRV      UTX       VZ      WMT      XOM
## 1 63.64432 28.37946 56.51405 80.98784 35.70378 54.19054 79.48324
## 2 64.04800 30.34000 60.56123 85.56692 36.94031 53.73800 83.33677
## 3  0.01680 -0.01400 -0.04960 -0.23680  0.05120 -0.04040 -0.15320
## 
## Clustering vector:
##   [1] 1 1 1 1 3 1 1 1 1 3 1 1 1 1 3 1 1 1 1 3 1 1 1 1 3 1 2 1 1 3 1 2 1 2 3
##  [36] 2 2 1 2 3 2 2 1 2 3 2 2 1 1 3 1 2 1 1 3 1 2 1 2 3 2 2 2 2 3 2 2 2 2 3
##  [71] 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3
## [106] 2 2 2 2 3 2 2 1 2 3 2 2 1 2 3 2 2 1 2 3 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 5449.840 9493.242 1516.377
##  (between_SS / total_SS =  99.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# K-means with 6 clusters on the raw stock data.
clu6_kmeans = do.kmeans(data, labels, k = 6)$cluster
## [1] "K-means with Clusters 6"
## K-means clustering with 6 clusters of sizes 20, 18, 18, 28, 25, 18
## 
## Cluster means:
##         AA      AXP       BA      BAC       CAT     CSCO       CVX
## 1 17.26700 46.05800 73.76800 13.32200 109.42500 17.40650 107.52100
## 2 16.07444 44.62944 69.62222 14.22444  94.28667 20.65333  92.77667
## 3 17.13667 50.39444 77.72944 12.04722 109.36667 17.03667 104.62167
## 4 16.69821 44.67143 71.49393 14.19250 102.46214 18.83464  99.96929
## 5  0.01960 -0.22440 -0.35400  0.12400  -0.08480  0.22320  -0.29480
## 6 15.36222 48.75556 74.56667 10.91000  98.82944 15.48500  99.76778
##         DD      DIS       GE       HD      HPQ      IBM     INTC      JNJ
## 1 55.09850 42.50850 20.22500 37.65350 41.28000 164.8375 20.48550 60.67100
## 2 49.37389 39.13000 19.11278 35.70500 45.10111 151.9250 20.90111 61.73167
## 3 54.30444 42.23444 19.98056 36.98167 39.44111 170.2706 23.08444 66.26111
## 4 53.55536 42.25036 20.53143 37.23929 44.70250 162.1479 21.22071 60.23857
## 5 -0.16680 -0.05520  0.05040 -0.02680  0.20040  -0.8992 -0.03080 -0.15920
## 6 50.48278 38.85222 18.57556 34.70833 35.33333 164.8494 21.96222 65.83778
##        JPM     KRFT       KO      MCD      MMM      MRK     MSFT      PFE
## 1 46.01900 66.87050 32.18000 76.48400 93.28500 33.52650 25.77150 20.32100
## 2 44.21944 63.04611 31.08278 74.33222 87.47667 34.60222 27.90556 18.25278
## 3 44.12222 67.60278 34.50333 80.67278 95.60222 36.56056 25.47000 20.87778
## 4 45.85571 64.00964 31.22821 75.24179 90.85393 32.77750 26.52429 19.32679
## 5  0.03320 -0.04400 -0.17400 -0.14840 -0.16800  0.01960  0.12560 -0.10560
## 6 40.97111 65.78556 33.76444 81.30111 91.42056 35.56278 24.34667 20.40944
##         PG        T      TRV      UTX       VZ      WMT      XOM
## 1 62.48300 30.12200 59.91950 84.92150 37.63450 52.92450 85.05900
## 2 64.38778 28.30278 54.89833 79.55000 35.47944 54.80667 77.24333
## 3 66.03389 31.44611 62.62222 88.59222 37.41278 55.10167 83.45667
## 4 62.99536 28.32429 58.84250 82.81607 36.21464 53.66679 83.28786
## 5  0.01680 -0.01400 -0.04960 -0.23680  0.05120 -0.04040 -0.15320
## 6 64.26889 30.61889 59.23056 84.14222 35.74444 53.25056 79.55167
## 
## Clustering vector:
##   [1] 2 2 2 2 5 2 2 2 2 5 2 2 2 2 5 2 4 2 2 5 2 4 2 4 5 4 4 4 4 5 4 4 4 4 5
##  [36] 4 4 4 4 5 4 1 4 4 5 4 1 4 4 5 4 4 2 4 5 4 1 4 1 5 1 1 1 1 5 1 1 1 1 5
##  [71] 1 1 4 1 5 1 1 4 1 5 1 3 1 3 5 3 3 3 3 5 3 3 3 3 5 3 3 6 3 5 6 3 6 3 5
## [106] 3 3 6 6 5 6 6 6 6 5 6 6 6 6 5 6 6 6 6 5 6 3
## 
## Within cluster sum of squares by cluster:
## [1]  730.2266 1130.2217 1212.3933 1935.9846 1516.3769  791.3674
##  (between_SS / total_SS =  99.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# MDS plots coloured by the two k-means solutions.
do.mdsplot(data=data.mds,labels = labels,clusters=3,methodName = "Kmeans",clusteredlabels = clu3_kmeans)

do.mdsplot(data=data.mds,labels = labels,clusters=6,methodName = "Kmeans",clusteredlabels = clu6_kmeans)

# Hierarchical clustering (single linkage, 3 clusters) with its dendrogram.
clu3_hclust_single = do.hclust(data.mds, methodName = "single",labels, k = 3, do.dendrogram = T)
## [1] "Hierarchial Clustering with 3  Clusters and Method name single"

##   [1] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
##  [36] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 3 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
##  [71] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
## [106] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1
# Single linkage, 6 clusters.
clu6_hclust_single = do.hclust(data.mds, methodName = "single",labels, k = 6, do.dendrogram = T)
## [1] "Hierarchial Clustering with 6  Clusters and Method name single"

##   [1] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 3 1 1 2 1 1 1 1 2
##  [36] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 4 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
##  [71] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 5 1 1 2 1 1 1 1 2
## [106] 1 1 1 1 6 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1
# Complete linkage, 3 clusters.
clu3_hclust_complete = do.hclust(data.mds, methodName = "complete",labels, k = 3, do.dendrogram = T)
## [1] "Hierarchial Clustering with 3  Clusters and Method name complete"

##   [1] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
##  [36] 1 1 1 3 2 3 3 1 3 2 3 3 1 3 2 1 3 1 1 2 1 3 1 3 2 3 3 3 3 2 3 3 3 3 2
##  [71] 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2
## [106] 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3
# Complete linkage, 6 clusters.
clu6_hclust_complete = do.hclust(data.mds, methodName = "complete",labels, k = 6, do.dendrogram = T)
## [1] "Hierarchial Clustering with 6  Clusters and Method name complete"

##   [1] 1 2 1 1 3 1 2 1 2 3 2 2 1 2 3 2 2 1 2 3 2 2 1 2 3 2 2 2 2 3 2 2 2 2 3
##  [36] 2 2 1 4 3 4 4 1 4 3 4 4 1 4 3 1 4 1 1 3 1 4 1 4 3 4 4 4 4 3 4 4 4 4 3
##  [71] 4 4 4 4 3 4 5 4 5 3 5 5 4 5 3 5 5 5 5 3 5 5 5 5 3 5 5 6 6 3 6 6 6 6 3
## [106] 6 5 6 6 3 6 6 6 6 3 6 6 6 6 3 6 6 6 6 3 6 6
# Average linkage, 3 clusters.
clu3_hclust_average = do.hclust(data.mds, methodName = "average",labels, k = 3, do.dendrogram = T)
## [1] "Hierarchial Clustering with 3  Clusters and Method name average"

##   [1] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
##  [36] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
##  [71] 1 1 1 1 2 1 3 1 3 2 3 3 1 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2
## [106] 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3
# Average linkage, 6 clusters.
clu6_hclust_average = do.hclust(data.mds, methodName = "average",labels, k = 6, do.dendrogram = T)
## [1] "Hierarchial Clustering with 6  Clusters and Method name average"

##   [1] 1 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3
##  [36] 2 2 1 1 3 1 1 1 1 3 1 1 1 1 3 1 1 4 1 3 1 1 1 1 3 1 1 1 1 3 1 1 1 1 3
##  [71] 1 1 1 1 3 1 5 1 5 3 5 5 1 5 3 5 5 5 5 3 5 5 5 5 3 5 5 6 5 3 6 5 6 5 3
## [106] 5 5 6 6 3 6 6 6 6 3 6 6 6 6 3 6 6 6 6 3 6 5
# Six more MDS plots: each hierarchical-clustering solution, coloured by
# cluster (together with the two k-means plots above: 8 MDS plots total).
do.mdsplot(data=data.mds,labels = labels,clusters=3,methodName = "HClust-single",clusteredlabels = clu3_hclust_single)

do.mdsplot(data=data.mds,labels = labels,clusters=6,methodName = "HClust-single",clusteredlabels = clu6_hclust_single)

do.mdsplot(data=data.mds,labels = labels,clusters=3,methodName = "HClust-complete" ,clusteredlabels = clu3_hclust_complete)

do.mdsplot(data=data.mds,labels = labels,clusters=6,methodName = "HClust-complete" ,clusteredlabels = clu6_hclust_complete)

do.mdsplot(data=data.mds,labels = labels,clusters=3,methodName = "HClust-average" ,clusteredlabels = clu3_hclust_average)

do.mdsplot(data=data.mds,labels = labels,clusters=6,methodName = "HClust-average" ,clusteredlabels = clu6_hclust_average)

# Senator Data

library("foreign")
# 113th Congress roll-call votes (Stata format).  Keep a raw copy holding
# the party column, and a vote-only copy for the distance computation.
raw_data=read.dta("sen113kh.dta")
data=read.dta("sen113kh.dta")
# Drop the first nine metadata columns; only the vote columns remain.
data=data[,10:length(colnames(data))]
#sen_data=prcomp(data, scale=TRUE)
#plot(sen_data,main="113th Congress Data") ## same as screeplot(pcafood)
#mtext(side=1, "Principal Components",  line=1, font=2)
# NOTE(review): `data` is a data frame here (PCA is commented out above),
# so data$x is very likely NULL and this line looks like a leftover;
# `new_data` is not used afterwards -- confirm.
new_data=data$x[,1:2]
# MDS on the senators' voting records.
data.dist=dist(data)
data.mds <- cmdscale(data.dist)
# NOTE(review): `labels` still holds 1:127 from the stock section but there
# are only 106 senators; text(x, y) with differing lengths errors -- verify.
# Colour by true party label (party codes 100/200 scaled to 1/2).
do.mdsplot(data=data.mds,labels = labels,clusters=2,methodName = "Democrats and Republicans",clusteredlabels = as.matrix(raw_data["party"]/100))

# Single-linkage hierarchical clustering of senators into 2 groups.
clu2_hclust_single = do.hclust(data.mds, methodName = "single",labels, k = 2, do.dendrogram = T)
## [1] "Hierarchial Clustering with 2  Clusters and Method name single"

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2
# Complete-linkage hierarchical clustering of senators into 2 groups.
clu2_hclust_complete = do.hclust(data.mds, methodName = "complete",labels, k = 2, do.dendrogram = T)
## [1] "Hierarchial Clustering with 2  Clusters and Method name complete"

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   1   1   2   2   1   1   2   1   2   2   2   2   2   2   2   2   1 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   1   1   2   2   1   1   2   1   1   2   1   2   1   1   1   1   1 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   1   1   2   1   2 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   2   2   1   1   1   2   1   2   2   2   2   2   2   2   2   2   1   2 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   2   1   2   1   1   1   2   2   2   1   2   2   1   1   1   2   1   1 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 
##   1   1   1   1   2   2   2   2   2   2   2   2   1   2   1   1
# Average-linkage hierarchical clustering of senators into 2 groups.
clu2_hclust_average = do.hclust(data.mds, methodName = "average",labels, k = 2, do.dendrogram = T)
## [1] "Hierarchial Clustering with 2  Clusters and Method name average"

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2
# K-means on the senators' MDS coordinates, 2 clusters.
clu2_kmeans=do.kmeans(data.mds, labels, k = 2)$cluster
## [1] "K-means with Clusters 2"
## K-means clustering with 2 clusters of sizes 45, 61
## 
## Cluster means:
##        [,1]        [,2]
## 1 -54.02422  0.03559439
## 2  39.85393 -0.02625816
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   1   1   1   2   1   1   2   1   2   2   2   2   2   2   2   2   1 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   1   1   2   2   1   1   2   1   1   2   1   2   1   1   1   1   1 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   1   1   2   1   2 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   2   2   1   1   1   2   1   2   2   2   2   2   2   2   2   2   1   2 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   2   1   2   1   1   1   2   2   2   1   2   2   1   1   1   2   1   1 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 
##   1   1   1   1   2   2   2   2   2   2   2   2   1   2   1   1 
## 
## Within cluster sum of squares by cluster:
## [1] 16329.72 15305.68
##  (between_SS / total_SS =  87.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Task 2: Analyze US Senator Roll Call Data. The objective is to identify and visualize the clustering patterns of senators voting activities.

# MDS plots of the senators coloured by each 2-cluster solution.
do.mdsplot(data=data.mds,labels = labels,clusters=2,methodName = "HClust-single",clusteredlabels = clu2_hclust_single)

do.mdsplot(data=data.mds,labels = labels,clusters=2,methodName = "HClust-complete" ,clusteredlabels = clu2_hclust_complete)

do.mdsplot(data=data.mds,labels = labels,clusters=2,methodName = "HClust-average" ,clusteredlabels = clu2_hclust_average)

do.mdsplot(data=data.mds,labels = labels,clusters=2,methodName = "Kmeans",clusteredlabels = clu2_kmeans)

2) Use k-means and hierarchial clustering to group the senators and color the senators on the MDS plots based on the clustering results.

# Attach each clustering result to the raw senator table so the
# confusion-matrix helpers below can look them up by column name.
raw_data$cluster_kmeans=clu2_kmeans
raw_data$cluster_hclust_single=clu2_hclust_single
raw_data$cluster_hclust_complete=clu2_hclust_complete
raw_data$cluster_hclust_average=clu2_hclust_average
do.check <- function(party, cluster, clusterMethod="cluster_kmeans", data = raw_data) {
  # Tally a confusion matrix comparing a clustering column against the
  # party label, printing the index of every mismatched senator.
  #
  # Args:
  #   party:         party code treated as the "positive" class (e.g. 100).
  #   cluster:       cluster id expected to correspond to `party`.
  #   clusterMethod: name of the column holding the cluster assignment.
  #   data:          data frame with a "party" column and the cluster
  #                  column (defaults to the global `raw_data`, keeping
  #                  the original call sites working unchanged).
  #
  # Returns: numeric vector c(TP, TN, FP, FN).
  TP = 0
  TN = 0
  FP = 0
  FN = 0
  for (x in seq_len(nrow(data))) {
    same_cluster = data[x, clusterMethod] == cluster
    same_party = data[x, "party"] == party
    if (same_cluster && same_party) {
      TP = TP + 1
    } else if (!same_cluster && !same_party) {
      TN = TN + 1
    } else if (same_cluster && !same_party) {
      FP = FP + 1
      # Typo fix: "conidered" -> "considered".
      print("Democrat considered as a republican on index")
      print(x)
    } else {
      FN = FN + 1
      print("Republican considered as a democrat on index")
      print(x)
    }
  }
  c(TP, TN, FP, FN)
}
  1. Compare the clustering results with the party labels and identify the party members who are assigned to a seemingly wrong cluster.
#raw_data[c("cluster_kmeans","party")]
#cluster_kmeans=raw_data["cluster_kmeans"];
do.confusionMatrix <-  function(cluster="cluster_kmeans"){
# Decide which cluster id (1 or 2) lines up with party code 100 (Democrats)
# by majority vote over all rows, then delegate to do.check() for the
# TP/TN/FP/FN counts.  Reads the global `raw_data`.
n=nrow(raw_data["party"]);
counter=0;
# After the loop, counter >= 0 iff most party-100 members sit in cluster 1.
for(x in 1:n){
  #print(x)
  #print(raw_data[x,cluster]);
  #print(raw_data[x,"party"]);
  if(raw_data[x,cluster]==1 && raw_data[x,"party"]==100){
      counter=counter+1;
  }
  else{
    counter=counter-1;
  }
}
# Strings compare against numeric columns via R's implicit coercion.
if(counter>=0){
    print("100 is Cluster 1")
    conf_mat=do.check("100","1",clusterMethod=cluster)
  } else{
    print("100 is Cluster 2");
    conf_mat=do.check("100","2",clusterMethod=cluster)
  }
  # Returns c(TP, TN, FP, FN) from do.check().
  return(conf_mat)
}

TP TN FP FN

# Confusion matrix for the k-means assignment vs. the party labels.
conf_mat_kmeans=do.confusionMatrix("cluster_kmeans")
## [1] "100 is Cluster 2"
## [1] "Republican conidered as a democrat on index"
## [1] 1
## [1] "Democrat conidered as a republican on index"
## [1] 38
## [1] "Democrat conidered as a republican on index"
## [1] 39
## [1] "Democrat conidered as a republican on index"
## [1] 65
## [1] "Democrat conidered as a republican on index"
## [1] 95
# Printed in the order TP, TN, FP, FN.
conf_mat_kmeans
## [1] 57 44  4  1
# Confusion matrix for single-linkage clustering vs. the party labels.
conf_hclust_single=do.confusionMatrix("cluster_hclust_single")
## [1] "100 is Cluster 2"
## [1] "Republican conidered as a democrat on index"
## [1] 1
## [1] "Democrat conidered as a republican on index"
## [1] 2
## [1] "Democrat conidered as a republican on index"
## [1] 3
## [1] "Democrat conidered as a republican on index"
## [1] 4
## [1] "Democrat conidered as a republican on index"
## [1] 6
## [1] "Democrat conidered as a republican on index"
## [1] 7
## [1] "Democrat conidered as a republican on index"
## [1] 9
## [1] "Democrat conidered as a republican on index"
## [1] 18
## [1] "Democrat conidered as a republican on index"
## [1] 20
## [1] "Democrat conidered as a republican on index"
## [1] 21
## [1] "Democrat conidered as a republican on index"
## [1] 24
## [1] "Democrat conidered as a republican on index"
## [1] 25
## [1] "Democrat conidered as a republican on index"
## [1] 27
## [1] "Democrat conidered as a republican on index"
## [1] 28
## [1] "Democrat conidered as a republican on index"
## [1] 30
## [1] "Democrat conidered as a republican on index"
## [1] 32
## [1] "Democrat conidered as a republican on index"
## [1] 33
## [1] "Democrat conidered as a republican on index"
## [1] 34
## [1] "Democrat conidered as a republican on index"
## [1] 35
## [1] "Democrat conidered as a republican on index"
## [1] 36
## [1] "Democrat conidered as a republican on index"
## [1] 38
## [1] "Democrat conidered as a republican on index"
## [1] 39
## [1] "Democrat conidered as a republican on index"
## [1] 50
## [1] "Democrat conidered as a republican on index"
## [1] 51
## [1] "Democrat conidered as a republican on index"
## [1] 53
## [1] "Democrat conidered as a republican on index"
## [1] 57
## [1] "Democrat conidered as a republican on index"
## [1] 58
## [1] "Democrat conidered as a republican on index"
## [1] 59
## [1] "Democrat conidered as a republican on index"
## [1] 61
## [1] "Democrat conidered as a republican on index"
## [1] 65
## [1] "Democrat conidered as a republican on index"
## [1] 71
## [1] "Democrat conidered as a republican on index"
## [1] 74
## [1] "Democrat conidered as a republican on index"
## [1] 76
## [1] "Democrat conidered as a republican on index"
## [1] 77
## [1] "Democrat conidered as a republican on index"
## [1] 78
## [1] "Democrat conidered as a republican on index"
## [1] 82
## [1] "Democrat conidered as a republican on index"
## [1] 85
## [1] "Democrat conidered as a republican on index"
## [1] 86
## [1] "Democrat conidered as a republican on index"
## [1] 87
## [1] "Democrat conidered as a republican on index"
## [1] 89
## [1] "Democrat conidered as a republican on index"
## [1] 90
## [1] "Democrat conidered as a republican on index"
## [1] 91
## [1] "Democrat conidered as a republican on index"
## [1] 92
## [1] "Democrat conidered as a republican on index"
## [1] 93
## [1] "Democrat conidered as a republican on index"
## [1] 94
## [1] "Democrat conidered as a republican on index"
## [1] 95
## [1] "Democrat conidered as a republican on index"
## [1] 103
## [1] "Democrat conidered as a republican on index"
## [1] 105
## [1] "Democrat conidered as a republican on index"
## [1] 106
# TP, TN, FP, FN for single linkage.
conf_hclust_single
## [1] 57  0 48  1
# Confusion matrix for complete-linkage clustering vs. the party labels.
conf_hclust_complete=do.confusionMatrix("cluster_hclust_complete")
## [1] "100 is Cluster 2"
## [1] "Republican conidered as a democrat on index"
## [1] 1
## [1] "Democrat conidered as a republican on index"
## [1] 4
## [1] "Democrat conidered as a republican on index"
## [1] 38
## [1] "Democrat conidered as a republican on index"
## [1] 39
## [1] "Democrat conidered as a republican on index"
## [1] 65
## [1] "Democrat conidered as a republican on index"
## [1] 95
# TP, TN, FP, FN for complete linkage.
conf_hclust_complete
## [1] 57 43  5  1
# Confusion matrix for average-linkage clustering vs. the party labels.
conf_hclust_average=do.confusionMatrix("cluster_hclust_average")
## [1] "100 is Cluster 2"
## [1] "Republican conidered as a democrat on index"
## [1] 1
## [1] "Democrat conidered as a republican on index"
## [1] 2
## [1] "Democrat conidered as a republican on index"
## [1] 3
## [1] "Democrat conidered as a republican on index"
## [1] 4
## [1] "Democrat conidered as a republican on index"
## [1] 6
## [1] "Democrat conidered as a republican on index"
## [1] 7
## [1] "Democrat conidered as a republican on index"
## [1] 9
## [1] "Democrat conidered as a republican on index"
## [1] 18
## [1] "Democrat conidered as a republican on index"
## [1] 20
## [1] "Democrat conidered as a republican on index"
## [1] 21
## [1] "Democrat conidered as a republican on index"
## [1] 24
## [1] "Democrat conidered as a republican on index"
## [1] 25
## [1] "Democrat conidered as a republican on index"
## [1] 27
## [1] "Democrat conidered as a republican on index"
## [1] 28
## [1] "Democrat conidered as a republican on index"
## [1] 30
## [1] "Democrat conidered as a republican on index"
## [1] 32
## [1] "Democrat conidered as a republican on index"
## [1] 33
## [1] "Democrat conidered as a republican on index"
## [1] 34
## [1] "Democrat conidered as a republican on index"
## [1] 35
## [1] "Democrat conidered as a republican on index"
## [1] 36
## [1] "Democrat conidered as a republican on index"
## [1] 38
## [1] "Democrat conidered as a republican on index"
## [1] 39
## [1] "Democrat conidered as a republican on index"
## [1] 50
## [1] "Democrat conidered as a republican on index"
## [1] 51
## [1] "Democrat conidered as a republican on index"
## [1] 53
## [1] "Democrat conidered as a republican on index"
## [1] 57
## [1] "Democrat conidered as a republican on index"
## [1] 58
## [1] "Democrat conidered as a republican on index"
## [1] 59
## [1] "Democrat conidered as a republican on index"
## [1] 61
## [1] "Democrat conidered as a republican on index"
## [1] 65
## [1] "Democrat conidered as a republican on index"
## [1] 71
## [1] "Democrat conidered as a republican on index"
## [1] 74
## [1] "Democrat conidered as a republican on index"
## [1] 76
## [1] "Democrat conidered as a republican on index"
## [1] 77
## [1] "Democrat conidered as a republican on index"
## [1] 78
## [1] "Democrat conidered as a republican on index"
## [1] 82
## [1] "Democrat conidered as a republican on index"
## [1] 85
## [1] "Democrat conidered as a republican on index"
## [1] 86
## [1] "Democrat conidered as a republican on index"
## [1] 87
## [1] "Democrat conidered as a republican on index"
## [1] 89
## [1] "Democrat conidered as a republican on index"
## [1] 90
## [1] "Democrat conidered as a republican on index"
## [1] 91
## [1] "Democrat conidered as a republican on index"
## [1] 92
## [1] "Democrat conidered as a republican on index"
## [1] 93
## [1] "Democrat conidered as a republican on index"
## [1] 94
## [1] "Democrat conidered as a republican on index"
## [1] 95
## [1] "Democrat conidered as a republican on index"
## [1] 103
## [1] "Democrat conidered as a republican on index"
## [1] 105
## [1] "Democrat conidered as a republican on index"
## [1] 106
# TP, TN, FP, FN for average linkage.
conf_hclust_average
## [1] 57  0 48  1

From the results above, we can see that the following misclassifications persist across all clustering methods:

[1] Republican considered as a democrat on index - 1 [2] Democrat considered as a republican on index - 38 [3] Democrat considered as a republican on index - 39 [4] Democrat considered as a republican on index - 65 [5] Democrat considered as a republican on index - 95

  1. Compute the purity and entropy for these clustering results with respect to the senators party label. You will generate a 2x4 table as follows:
cluster.purity <- function(clusters, classes) {
  # Purity of a clustering: for each cluster take the count of its
  # majority class, sum over clusters, and divide by the total number
  # of observations.  1 means every cluster is class-pure.
  counts <- table(classes, clusters)
  majority_total <- sum(apply(counts, 2, max))
  majority_total / length(clusters)
}
cluster.entropy <- function(clusters, classes) {
  # Weighted average entropy of the class distribution within each
  # cluster; 0 means every cluster contains a single class.
  M <- table(classes, clusters)
  # Shannon entropy (base 2) of one cluster's class counts; zero-count
  # classes contribute nothing (same convention as 0 * log 0 = 0).
  per_cluster <- apply(M, 2, function(counts) {
    p <- counts / sum(counts)
    p <- p[p > 0]
    -sum(p * log2(p))
  })
  # Weight each cluster's entropy by its share of all observations.
  weights <- colSums(M) / sum(M)
  sum(per_cluster * weights)
}
# Purity and entropy of each 2-cluster solution against the party labels.
kmeans_purity=cluster.purity(as.matrix(raw_data["cluster_kmeans"]),as.matrix(raw_data["party"]))
kmeans_entropy=cluster.entropy(as.matrix(raw_data["cluster_kmeans"]),as.matrix(raw_data["party"]))
kmeans=c(kmeans_purity,kmeans_entropy)
hclust_single_purity=cluster.purity(as.matrix(raw_data["cluster_hclust_single"]),as.matrix(raw_data["party"]))
hclust_single_entropy=cluster.entropy(as.matrix(raw_data["cluster_hclust_single"]),as.matrix(raw_data["party"]))
hclust_single=c(hclust_single_purity,hclust_single_entropy)
hclust_complete_purity=cluster.purity(as.matrix(raw_data["cluster_hclust_complete"]),as.matrix(raw_data["party"]))
hclust_complete_entropy=cluster.entropy(as.matrix(raw_data["cluster_hclust_complete"]),as.matrix(raw_data["party"]))
hclust_complete=c(hclust_complete_purity,hclust_complete_entropy)
hclust_average_purity=cluster.purity(as.matrix(raw_data["cluster_hclust_average"]),as.matrix(raw_data["party"]))
hclust_average_entropy=cluster.entropy(as.matrix(raw_data["cluster_hclust_average"]),as.matrix(raw_data["party"]))
hclust_average=c(hclust_average_purity,hclust_average_entropy)
# Assemble the 2x4 summary table: rows = purity/entropy, columns = methods.
dF <- data.frame("kmeans"=kmeans,"hclust_single"=hclust_single,"hclust_complete"=hclust_complete,"hclust_average"=hclust_average)
rownames(dF)= c("purity","entropy")
dF
##            kmeans hclust_single hclust_complete hclust_average
## purity  0.9528302     0.5471698       0.9433962      0.5471698
## entropy 0.3039495     1.0984641       0.3473223      1.0984641
  1. Based on your observation of both measures and the mis-classified members, choose two clustering methods that generate the most meaningful results and explain why.

Looking at the purity and entropy across all four methods, we can see that purity is highest for k-means and for hierarchical clustering with complete linkage. The entropy is also the lowest for these two algorithms.

Not only the purity and entropy but we could check that the False positives and false negatives are the least for these two.